{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-03-12T02:21:52.022973Z",
     "start_time": "2019-03-12T02:21:48.983736Z"
    }
   },
   "outputs": [],
   "source": [
    "##==================== 导入必要的工具包 ====================##\n",
    "import pandas as pd \n",
    "import numpy as np\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-03-12T02:23:00.907890Z",
     "start_time": "2019-03-12T02:21:53.524234Z"
    }
   },
   "outputs": [],
   "source": [
    "# 读入数据，这里读取了500万行，为的是方便本地运行，且符合kaggle提交结果至少470万行的要求\n",
    "# 且需要注意，读入的时候，要把id 读成 str 格式，为的是最后生成的Submission中的id不是科学计数法表示，否则，kaggle会报错\n",
    "\n",
    "train = pd.read_csv('train.csv',nrows=5000000,dtype={'id':str})\n",
    "test = pd.read_csv('test.csv',nrows=5000000,dtype={'id':str})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-03-12T02:23:56.686562Z",
     "start_time": "2019-03-12T02:23:45.411037Z"
    }
   },
   "outputs": [],
   "source": [
    "##==================== Create 'date' feature ====================##\n",
    "\n",
    "# hour: format is YYMMDDHH, so 14102209 means 09:00 on Oct. 22, 2014 UTC. （时间）\n",
    "# 2014-10-21-00:00 (Tuesday) -- 2014-10-30-23:00 (Thursday)\n",
    "# Weekends are 2014-10-25,26 (Saturday and Sunday)\n",
    "# Convert 14102422 => 24\n",
    "# x % 100000 = 2422; x % 100 = 22; 2422-22 = 2400; int(2400/100) = 24.\n",
    "train['date'] = train['hour'].apply(lambda x: int(((x % 100000)-(x % 100)) / 100)) \n",
    "test['date'] = test['hour'].apply(lambda x: int(((x % 100000)-(x % 100)) / 100)) "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-03-12T02:24:03.637172Z",
     "start_time": "2019-03-12T02:23:58.454621Z"
    }
   },
   "outputs": [],
   "source": [
    "##==================== Create 'time_period' feature ====================##\n",
    "\n",
    "train['time'] = train['hour'].apply(lambda x: x % 100) # means: 14102422 => 22\n",
    "test['time'] = test['hour'].apply(lambda x: x % 100) # means: 14102422 => 22"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-03-12T02:24:11.394585Z",
     "start_time": "2019-03-12T02:24:06.670922Z"
    }
   },
   "outputs": [],
   "source": [
    "train['time_period'] = train['time'].apply(lambda x: 0 if x<6 else (1 if x<12 else (2 if x<18 else 3))) # means: 14102422 => 22\n",
    "test['time_period'] = test['time'].apply(lambda x: 0 if x<6 else (1 if x<12 else (2 if x<18 else 3))) # means: 14102422 => 22"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-03-12T02:25:02.858033Z",
     "start_time": "2019-03-12T02:24:14.135964Z"
    }
   },
   "outputs": [],
   "source": [
    "# Check whether conversion is correct:\n",
    "# train[train['time_period']==1]['time'] \n",
    "# Delete the 'time' feature.\n",
    "train.drop('time',axis=1,inplace=True)\n",
    "test.drop('time',axis=1,inplace=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-03-12T02:27:22.318110Z",
     "start_time": "2019-03-12T02:26:17.464032Z"
    }
   },
   "outputs": [],
   "source": [
    "##==================== Create 'weekday' feature ====================##\n",
    "train['weekday'] = train['date'].apply(lambda x: 0 if x==25 or x==26 else 1) # Only dates 25 and 26 are weekends.\n",
    "# Check whether conversion is correct.\n",
    "# train[train['weekday']==0]['date'] \n",
    "# Delete the 'hour' feature.\n",
    "train.drop('hour',axis=1,inplace=True)\n",
    "\n",
    "\n",
    "test['weekday'] = test['date'].apply(lambda x: 0 if x==25 or x==26 else 1) # Only dates 25 and 26 are weekends.\n",
    "# Check whether conversion is correct.\n",
    "# train[train['weekday']==0]['date'] \n",
    "# Delete the 'hour' feature.\n",
    "test.drop('hour',axis=1,inplace=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-03-12T02:27:44.156879Z",
     "start_time": "2019-03-12T02:27:43.890085Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>id</th>\n",
       "      <th>click</th>\n",
       "      <th>C1</th>\n",
       "      <th>banner_pos</th>\n",
       "      <th>site_id</th>\n",
       "      <th>site_domain</th>\n",
       "      <th>site_category</th>\n",
       "      <th>app_id</th>\n",
       "      <th>app_domain</th>\n",
       "      <th>app_category</th>\n",
       "      <th>...</th>\n",
       "      <th>C15</th>\n",
       "      <th>C16</th>\n",
       "      <th>C17</th>\n",
       "      <th>C18</th>\n",
       "      <th>C19</th>\n",
       "      <th>C20</th>\n",
       "      <th>C21</th>\n",
       "      <th>date</th>\n",
       "      <th>time_period</th>\n",
       "      <th>weekday</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1000009418151094273</td>\n",
       "      <td>0</td>\n",
       "      <td>1005</td>\n",
       "      <td>0</td>\n",
       "      <td>1fbe01fe</td>\n",
       "      <td>f3845767</td>\n",
       "      <td>28905ebd</td>\n",
       "      <td>ecad2386</td>\n",
       "      <td>7801e8d9</td>\n",
       "      <td>07d7df22</td>\n",
       "      <td>...</td>\n",
       "      <td>320</td>\n",
       "      <td>50</td>\n",
       "      <td>1722</td>\n",
       "      <td>0</td>\n",
       "      <td>35</td>\n",
       "      <td>-1</td>\n",
       "      <td>79</td>\n",
       "      <td>21</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>10000169349117863715</td>\n",
       "      <td>0</td>\n",
       "      <td>1005</td>\n",
       "      <td>0</td>\n",
       "      <td>1fbe01fe</td>\n",
       "      <td>f3845767</td>\n",
       "      <td>28905ebd</td>\n",
       "      <td>ecad2386</td>\n",
       "      <td>7801e8d9</td>\n",
       "      <td>07d7df22</td>\n",
       "      <td>...</td>\n",
       "      <td>320</td>\n",
       "      <td>50</td>\n",
       "      <td>1722</td>\n",
       "      <td>0</td>\n",
       "      <td>35</td>\n",
       "      <td>100084</td>\n",
       "      <td>79</td>\n",
       "      <td>21</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>10000371904215119486</td>\n",
       "      <td>0</td>\n",
       "      <td>1005</td>\n",
       "      <td>0</td>\n",
       "      <td>1fbe01fe</td>\n",
       "      <td>f3845767</td>\n",
       "      <td>28905ebd</td>\n",
       "      <td>ecad2386</td>\n",
       "      <td>7801e8d9</td>\n",
       "      <td>07d7df22</td>\n",
       "      <td>...</td>\n",
       "      <td>320</td>\n",
       "      <td>50</td>\n",
       "      <td>1722</td>\n",
       "      <td>0</td>\n",
       "      <td>35</td>\n",
       "      <td>100084</td>\n",
       "      <td>79</td>\n",
       "      <td>21</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>10000640724480838376</td>\n",
       "      <td>0</td>\n",
       "      <td>1005</td>\n",
       "      <td>0</td>\n",
       "      <td>1fbe01fe</td>\n",
       "      <td>f3845767</td>\n",
       "      <td>28905ebd</td>\n",
       "      <td>ecad2386</td>\n",
       "      <td>7801e8d9</td>\n",
       "      <td>07d7df22</td>\n",
       "      <td>...</td>\n",
       "      <td>320</td>\n",
       "      <td>50</td>\n",
       "      <td>1722</td>\n",
       "      <td>0</td>\n",
       "      <td>35</td>\n",
       "      <td>100084</td>\n",
       "      <td>79</td>\n",
       "      <td>21</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>10000679056417042096</td>\n",
       "      <td>0</td>\n",
       "      <td>1005</td>\n",
       "      <td>1</td>\n",
       "      <td>fe8cc448</td>\n",
       "      <td>9166c161</td>\n",
       "      <td>0569f928</td>\n",
       "      <td>ecad2386</td>\n",
       "      <td>7801e8d9</td>\n",
       "      <td>07d7df22</td>\n",
       "      <td>...</td>\n",
       "      <td>320</td>\n",
       "      <td>50</td>\n",
       "      <td>2161</td>\n",
       "      <td>0</td>\n",
       "      <td>35</td>\n",
       "      <td>-1</td>\n",
       "      <td>157</td>\n",
       "      <td>21</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>5 rows × 26 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "                     id  click    C1  banner_pos   site_id site_domain  \\\n",
       "0   1000009418151094273      0  1005           0  1fbe01fe    f3845767   \n",
       "1  10000169349117863715      0  1005           0  1fbe01fe    f3845767   \n",
       "2  10000371904215119486      0  1005           0  1fbe01fe    f3845767   \n",
       "3  10000640724480838376      0  1005           0  1fbe01fe    f3845767   \n",
       "4  10000679056417042096      0  1005           1  fe8cc448    9166c161   \n",
       "\n",
       "  site_category    app_id app_domain app_category  ...  C15 C16   C17  C18  \\\n",
       "0      28905ebd  ecad2386   7801e8d9     07d7df22  ...  320  50  1722    0   \n",
       "1      28905ebd  ecad2386   7801e8d9     07d7df22  ...  320  50  1722    0   \n",
       "2      28905ebd  ecad2386   7801e8d9     07d7df22  ...  320  50  1722    0   \n",
       "3      28905ebd  ecad2386   7801e8d9     07d7df22  ...  320  50  1722    0   \n",
       "4      0569f928  ecad2386   7801e8d9     07d7df22  ...  320  50  2161    0   \n",
       "\n",
       "   C19     C20  C21  date  time_period  weekday  \n",
       "0   35      -1   79    21            0        1  \n",
       "1   35  100084   79    21            0        1  \n",
       "2   35  100084   79    21            0        1  \n",
       "3   35  100084   79    21            0        1  \n",
       "4   35      -1  157    21            0        1  \n",
       "\n",
       "[5 rows x 26 columns]"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "train.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-03-12T02:27:47.562292Z",
     "start_time": "2019-03-12T02:27:47.533383Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>id</th>\n",
       "      <th>C1</th>\n",
       "      <th>banner_pos</th>\n",
       "      <th>site_id</th>\n",
       "      <th>site_domain</th>\n",
       "      <th>site_category</th>\n",
       "      <th>app_id</th>\n",
       "      <th>app_domain</th>\n",
       "      <th>app_category</th>\n",
       "      <th>device_id</th>\n",
       "      <th>...</th>\n",
       "      <th>C15</th>\n",
       "      <th>C16</th>\n",
       "      <th>C17</th>\n",
       "      <th>C18</th>\n",
       "      <th>C19</th>\n",
       "      <th>C20</th>\n",
       "      <th>C21</th>\n",
       "      <th>date</th>\n",
       "      <th>time_period</th>\n",
       "      <th>weekday</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>10000174058809263569</td>\n",
       "      <td>1005</td>\n",
       "      <td>0</td>\n",
       "      <td>235ba823</td>\n",
       "      <td>f6ebf28e</td>\n",
       "      <td>f028772b</td>\n",
       "      <td>ecad2386</td>\n",
       "      <td>7801e8d9</td>\n",
       "      <td>07d7df22</td>\n",
       "      <td>a99f214a</td>\n",
       "      <td>...</td>\n",
       "      <td>320</td>\n",
       "      <td>50</td>\n",
       "      <td>761</td>\n",
       "      <td>3</td>\n",
       "      <td>175</td>\n",
       "      <td>100075</td>\n",
       "      <td>23</td>\n",
       "      <td>31</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>10000182526920855428</td>\n",
       "      <td>1005</td>\n",
       "      <td>0</td>\n",
       "      <td>1fbe01fe</td>\n",
       "      <td>f3845767</td>\n",
       "      <td>28905ebd</td>\n",
       "      <td>ecad2386</td>\n",
       "      <td>7801e8d9</td>\n",
       "      <td>07d7df22</td>\n",
       "      <td>a99f214a</td>\n",
       "      <td>...</td>\n",
       "      <td>320</td>\n",
       "      <td>50</td>\n",
       "      <td>2616</td>\n",
       "      <td>0</td>\n",
       "      <td>35</td>\n",
       "      <td>100083</td>\n",
       "      <td>51</td>\n",
       "      <td>31</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>10000554139829213984</td>\n",
       "      <td>1005</td>\n",
       "      <td>0</td>\n",
       "      <td>1fbe01fe</td>\n",
       "      <td>f3845767</td>\n",
       "      <td>28905ebd</td>\n",
       "      <td>ecad2386</td>\n",
       "      <td>7801e8d9</td>\n",
       "      <td>07d7df22</td>\n",
       "      <td>a99f214a</td>\n",
       "      <td>...</td>\n",
       "      <td>320</td>\n",
       "      <td>50</td>\n",
       "      <td>2616</td>\n",
       "      <td>0</td>\n",
       "      <td>35</td>\n",
       "      <td>100083</td>\n",
       "      <td>51</td>\n",
       "      <td>31</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>10001094637809798845</td>\n",
       "      <td>1005</td>\n",
       "      <td>0</td>\n",
       "      <td>85f751fd</td>\n",
       "      <td>c4e18dd6</td>\n",
       "      <td>50e219e0</td>\n",
       "      <td>51cedd4e</td>\n",
       "      <td>aefc06bd</td>\n",
       "      <td>0f2161f8</td>\n",
       "      <td>a99f214a</td>\n",
       "      <td>...</td>\n",
       "      <td>320</td>\n",
       "      <td>50</td>\n",
       "      <td>1092</td>\n",
       "      <td>3</td>\n",
       "      <td>809</td>\n",
       "      <td>100156</td>\n",
       "      <td>61</td>\n",
       "      <td>31</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>10001377041558670745</td>\n",
       "      <td>1005</td>\n",
       "      <td>0</td>\n",
       "      <td>85f751fd</td>\n",
       "      <td>c4e18dd6</td>\n",
       "      <td>50e219e0</td>\n",
       "      <td>9c13b419</td>\n",
       "      <td>2347f47a</td>\n",
       "      <td>f95efa07</td>\n",
       "      <td>a99f214a</td>\n",
       "      <td>...</td>\n",
       "      <td>320</td>\n",
       "      <td>50</td>\n",
       "      <td>2667</td>\n",
       "      <td>0</td>\n",
       "      <td>47</td>\n",
       "      <td>-1</td>\n",
       "      <td>221</td>\n",
       "      <td>31</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>5 rows × 25 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "                     id    C1  banner_pos   site_id site_domain site_category  \\\n",
       "0  10000174058809263569  1005           0  235ba823    f6ebf28e      f028772b   \n",
       "1  10000182526920855428  1005           0  1fbe01fe    f3845767      28905ebd   \n",
       "2  10000554139829213984  1005           0  1fbe01fe    f3845767      28905ebd   \n",
       "3  10001094637809798845  1005           0  85f751fd    c4e18dd6      50e219e0   \n",
       "4  10001377041558670745  1005           0  85f751fd    c4e18dd6      50e219e0   \n",
       "\n",
       "     app_id app_domain app_category device_id  ...  C15 C16   C17  C18  C19  \\\n",
       "0  ecad2386   7801e8d9     07d7df22  a99f214a  ...  320  50   761    3  175   \n",
       "1  ecad2386   7801e8d9     07d7df22  a99f214a  ...  320  50  2616    0   35   \n",
       "2  ecad2386   7801e8d9     07d7df22  a99f214a  ...  320  50  2616    0   35   \n",
       "3  51cedd4e   aefc06bd     0f2161f8  a99f214a  ...  320  50  1092    3  809   \n",
       "4  9c13b419   2347f47a     f95efa07  a99f214a  ...  320  50  2667    0   47   \n",
       "\n",
       "      C20  C21  date  time_period  weekday  \n",
       "0  100075   23    31            0        1  \n",
       "1  100083   51    31            0        1  \n",
       "2  100083   51    31            0        1  \n",
       "3  100156   61    31            0        1  \n",
       "4      -1  221    31            0        1  \n",
       "\n",
       "[5 rows x 25 columns]"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "test.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-03-12T02:28:48.761983Z",
     "start_time": "2019-03-12T02:27:50.566240Z"
    }
   },
   "outputs": [],
   "source": [
    "##==================== Create 'C15_C16' feature ====================##\n",
    "\n",
    "# C15_C16 = C15 - C16 in order to combine features C15 and C16 to create a new feature;\n",
    "train['C15_C16'] = train['C15']-train['C16'] \n",
    "# Delete the 'C15' 和 ‘C16’ features.\n",
    "train.drop(['C15','C16'],axis=1,inplace=True)\n",
    "\n",
    "# C15_C16 = C15 - C16 in order to combine features C15 and C16 to create a new feature;\n",
    "test['C15_C16'] = test['C15']-test['C16'] \n",
    "# Delete the 'C15' 和 ‘C16’ features.\n",
    "test.drop(['C15','C16'],axis=1,inplace=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-03-12T02:29:09.699531Z",
     "start_time": "2019-03-12T02:29:09.466238Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>id</th>\n",
       "      <th>click</th>\n",
       "      <th>C1</th>\n",
       "      <th>banner_pos</th>\n",
       "      <th>site_id</th>\n",
       "      <th>site_domain</th>\n",
       "      <th>site_category</th>\n",
       "      <th>app_id</th>\n",
       "      <th>app_domain</th>\n",
       "      <th>app_category</th>\n",
       "      <th>...</th>\n",
       "      <th>C14</th>\n",
       "      <th>C17</th>\n",
       "      <th>C18</th>\n",
       "      <th>C19</th>\n",
       "      <th>C20</th>\n",
       "      <th>C21</th>\n",
       "      <th>date</th>\n",
       "      <th>time_period</th>\n",
       "      <th>weekday</th>\n",
       "      <th>C15_C16</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1000009418151094273</td>\n",
       "      <td>0</td>\n",
       "      <td>1005</td>\n",
       "      <td>0</td>\n",
       "      <td>1fbe01fe</td>\n",
       "      <td>f3845767</td>\n",
       "      <td>28905ebd</td>\n",
       "      <td>ecad2386</td>\n",
       "      <td>7801e8d9</td>\n",
       "      <td>07d7df22</td>\n",
       "      <td>...</td>\n",
       "      <td>15706</td>\n",
       "      <td>1722</td>\n",
       "      <td>0</td>\n",
       "      <td>35</td>\n",
       "      <td>-1</td>\n",
       "      <td>79</td>\n",
       "      <td>21</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>270</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>10000169349117863715</td>\n",
       "      <td>0</td>\n",
       "      <td>1005</td>\n",
       "      <td>0</td>\n",
       "      <td>1fbe01fe</td>\n",
       "      <td>f3845767</td>\n",
       "      <td>28905ebd</td>\n",
       "      <td>ecad2386</td>\n",
       "      <td>7801e8d9</td>\n",
       "      <td>07d7df22</td>\n",
       "      <td>...</td>\n",
       "      <td>15704</td>\n",
       "      <td>1722</td>\n",
       "      <td>0</td>\n",
       "      <td>35</td>\n",
       "      <td>100084</td>\n",
       "      <td>79</td>\n",
       "      <td>21</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>270</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>10000371904215119486</td>\n",
       "      <td>0</td>\n",
       "      <td>1005</td>\n",
       "      <td>0</td>\n",
       "      <td>1fbe01fe</td>\n",
       "      <td>f3845767</td>\n",
       "      <td>28905ebd</td>\n",
       "      <td>ecad2386</td>\n",
       "      <td>7801e8d9</td>\n",
       "      <td>07d7df22</td>\n",
       "      <td>...</td>\n",
       "      <td>15704</td>\n",
       "      <td>1722</td>\n",
       "      <td>0</td>\n",
       "      <td>35</td>\n",
       "      <td>100084</td>\n",
       "      <td>79</td>\n",
       "      <td>21</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>270</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>10000640724480838376</td>\n",
       "      <td>0</td>\n",
       "      <td>1005</td>\n",
       "      <td>0</td>\n",
       "      <td>1fbe01fe</td>\n",
       "      <td>f3845767</td>\n",
       "      <td>28905ebd</td>\n",
       "      <td>ecad2386</td>\n",
       "      <td>7801e8d9</td>\n",
       "      <td>07d7df22</td>\n",
       "      <td>...</td>\n",
       "      <td>15706</td>\n",
       "      <td>1722</td>\n",
       "      <td>0</td>\n",
       "      <td>35</td>\n",
       "      <td>100084</td>\n",
       "      <td>79</td>\n",
       "      <td>21</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>270</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>10000679056417042096</td>\n",
       "      <td>0</td>\n",
       "      <td>1005</td>\n",
       "      <td>1</td>\n",
       "      <td>fe8cc448</td>\n",
       "      <td>9166c161</td>\n",
       "      <td>0569f928</td>\n",
       "      <td>ecad2386</td>\n",
       "      <td>7801e8d9</td>\n",
       "      <td>07d7df22</td>\n",
       "      <td>...</td>\n",
       "      <td>18993</td>\n",
       "      <td>2161</td>\n",
       "      <td>0</td>\n",
       "      <td>35</td>\n",
       "      <td>-1</td>\n",
       "      <td>157</td>\n",
       "      <td>21</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>270</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>5 rows × 25 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "                     id  click    C1  banner_pos   site_id site_domain  \\\n",
       "0   1000009418151094273      0  1005           0  1fbe01fe    f3845767   \n",
       "1  10000169349117863715      0  1005           0  1fbe01fe    f3845767   \n",
       "2  10000371904215119486      0  1005           0  1fbe01fe    f3845767   \n",
       "3  10000640724480838376      0  1005           0  1fbe01fe    f3845767   \n",
       "4  10000679056417042096      0  1005           1  fe8cc448    9166c161   \n",
       "\n",
       "  site_category    app_id app_domain app_category  ...    C14   C17 C18  C19  \\\n",
       "0      28905ebd  ecad2386   7801e8d9     07d7df22  ...  15706  1722   0   35   \n",
       "1      28905ebd  ecad2386   7801e8d9     07d7df22  ...  15704  1722   0   35   \n",
       "2      28905ebd  ecad2386   7801e8d9     07d7df22  ...  15704  1722   0   35   \n",
       "3      28905ebd  ecad2386   7801e8d9     07d7df22  ...  15706  1722   0   35   \n",
       "4      0569f928  ecad2386   7801e8d9     07d7df22  ...  18993  2161   0   35   \n",
       "\n",
       "      C20  C21  date  time_period  weekday  C15_C16  \n",
       "0      -1   79    21            0        1      270  \n",
       "1  100084   79    21            0        1      270  \n",
       "2  100084   79    21            0        1      270  \n",
       "3  100084   79    21            0        1      270  \n",
       "4      -1  157    21            0        1      270  \n",
       "\n",
       "[5 rows x 25 columns]"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "train.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-03-12T02:29:13.488043Z",
     "start_time": "2019-03-12T02:29:13.441220Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>id</th>\n",
       "      <th>C1</th>\n",
       "      <th>banner_pos</th>\n",
       "      <th>site_id</th>\n",
       "      <th>site_domain</th>\n",
       "      <th>site_category</th>\n",
       "      <th>app_id</th>\n",
       "      <th>app_domain</th>\n",
       "      <th>app_category</th>\n",
       "      <th>device_id</th>\n",
       "      <th>...</th>\n",
       "      <th>C14</th>\n",
       "      <th>C17</th>\n",
       "      <th>C18</th>\n",
       "      <th>C19</th>\n",
       "      <th>C20</th>\n",
       "      <th>C21</th>\n",
       "      <th>date</th>\n",
       "      <th>time_period</th>\n",
       "      <th>weekday</th>\n",
       "      <th>C15_C16</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>10000174058809263569</td>\n",
       "      <td>1005</td>\n",
       "      <td>0</td>\n",
       "      <td>235ba823</td>\n",
       "      <td>f6ebf28e</td>\n",
       "      <td>f028772b</td>\n",
       "      <td>ecad2386</td>\n",
       "      <td>7801e8d9</td>\n",
       "      <td>07d7df22</td>\n",
       "      <td>a99f214a</td>\n",
       "      <td>...</td>\n",
       "      <td>8330</td>\n",
       "      <td>761</td>\n",
       "      <td>3</td>\n",
       "      <td>175</td>\n",
       "      <td>100075</td>\n",
       "      <td>23</td>\n",
       "      <td>31</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>270</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>10000182526920855428</td>\n",
       "      <td>1005</td>\n",
       "      <td>0</td>\n",
       "      <td>1fbe01fe</td>\n",
       "      <td>f3845767</td>\n",
       "      <td>28905ebd</td>\n",
       "      <td>ecad2386</td>\n",
       "      <td>7801e8d9</td>\n",
       "      <td>07d7df22</td>\n",
       "      <td>a99f214a</td>\n",
       "      <td>...</td>\n",
       "      <td>22676</td>\n",
       "      <td>2616</td>\n",
       "      <td>0</td>\n",
       "      <td>35</td>\n",
       "      <td>100083</td>\n",
       "      <td>51</td>\n",
       "      <td>31</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>270</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>10000554139829213984</td>\n",
       "      <td>1005</td>\n",
       "      <td>0</td>\n",
       "      <td>1fbe01fe</td>\n",
       "      <td>f3845767</td>\n",
       "      <td>28905ebd</td>\n",
       "      <td>ecad2386</td>\n",
       "      <td>7801e8d9</td>\n",
       "      <td>07d7df22</td>\n",
       "      <td>a99f214a</td>\n",
       "      <td>...</td>\n",
       "      <td>22676</td>\n",
       "      <td>2616</td>\n",
       "      <td>0</td>\n",
       "      <td>35</td>\n",
       "      <td>100083</td>\n",
       "      <td>51</td>\n",
       "      <td>31</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>270</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>10001094637809798845</td>\n",
       "      <td>1005</td>\n",
       "      <td>0</td>\n",
       "      <td>85f751fd</td>\n",
       "      <td>c4e18dd6</td>\n",
       "      <td>50e219e0</td>\n",
       "      <td>51cedd4e</td>\n",
       "      <td>aefc06bd</td>\n",
       "      <td>0f2161f8</td>\n",
       "      <td>a99f214a</td>\n",
       "      <td>...</td>\n",
       "      <td>18648</td>\n",
       "      <td>1092</td>\n",
       "      <td>3</td>\n",
       "      <td>809</td>\n",
       "      <td>100156</td>\n",
       "      <td>61</td>\n",
       "      <td>31</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>270</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>10001377041558670745</td>\n",
       "      <td>1005</td>\n",
       "      <td>0</td>\n",
       "      <td>85f751fd</td>\n",
       "      <td>c4e18dd6</td>\n",
       "      <td>50e219e0</td>\n",
       "      <td>9c13b419</td>\n",
       "      <td>2347f47a</td>\n",
       "      <td>f95efa07</td>\n",
       "      <td>a99f214a</td>\n",
       "      <td>...</td>\n",
       "      <td>23160</td>\n",
       "      <td>2667</td>\n",
       "      <td>0</td>\n",
       "      <td>47</td>\n",
       "      <td>-1</td>\n",
       "      <td>221</td>\n",
       "      <td>31</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>270</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>5 rows × 24 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "                     id    C1  banner_pos   site_id site_domain site_category  \\\n",
       "0  10000174058809263569  1005           0  235ba823    f6ebf28e      f028772b   \n",
       "1  10000182526920855428  1005           0  1fbe01fe    f3845767      28905ebd   \n",
       "2  10000554139829213984  1005           0  1fbe01fe    f3845767      28905ebd   \n",
       "3  10001094637809798845  1005           0  85f751fd    c4e18dd6      50e219e0   \n",
       "4  10001377041558670745  1005           0  85f751fd    c4e18dd6      50e219e0   \n",
       "\n",
       "     app_id app_domain app_category device_id  ...    C14   C17  C18  C19  \\\n",
       "0  ecad2386   7801e8d9     07d7df22  a99f214a  ...   8330   761    3  175   \n",
       "1  ecad2386   7801e8d9     07d7df22  a99f214a  ...  22676  2616    0   35   \n",
       "2  ecad2386   7801e8d9     07d7df22  a99f214a  ...  22676  2616    0   35   \n",
       "3  51cedd4e   aefc06bd     0f2161f8  a99f214a  ...  18648  1092    3  809   \n",
       "4  9c13b419   2347f47a     f95efa07  a99f214a  ...  23160  2667    0   47   \n",
       "\n",
       "      C20  C21  date  time_period  weekday  C15_C16  \n",
       "0  100075   23    31            0        1      270  \n",
       "1  100083   51    31            0        1      270  \n",
       "2  100083   51    31            0        1      270  \n",
       "3  100156   61    31            0        1      270  \n",
       "4      -1  221    31            0        1      270  \n",
       "\n",
       "[5 rows x 24 columns]"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "test.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-03-12T02:40:32.077073Z",
     "start_time": "2019-03-12T02:29:16.847856Z"
    }
   },
   "outputs": [],
   "source": [
    "# 保存新的数据集\n",
    "train.to_csv('group_train.csv', index = False)\n",
    "test.to_csv('group_test.csv', index = False)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.4"
  },
  "toc": {
   "base_numbering": 1,
   "nav_menu": {},
   "number_sections": true,
   "sideBar": true,
   "skip_h1_title": false,
   "title_cell": "Table of Contents",
   "title_sidebar": "Contents",
   "toc_cell": false,
   "toc_position": {},
   "toc_section_display": true,
   "toc_window_display": false
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
